Analysis by School¶

In this notebook, the approximately 14k schools in the dataset are matched with the poverty percentage in each school's respective district. The data includes per-school enrollment of males and females in various programs and high school level courses. Schools with low enrollment are removed from the analysis. The analysis consists of principal component analysis (PCA), K-Means clustering, linear discriminant analysis (LDA), correlation, covariance, and multiple regression predicting poverty rate based on enrollment.

In [1]:
# Batch 1: per-school course enrollment counts (advanced math, AP, Algebra I/II,
# biology, calculus, chemistry, dual enrollment) for high-school-only schools,
# joined on combokey, plus SAIPE district join on leaid.
batch1_query="""
SELECT 
	rls.leaid
	,rls.schid
	,rls.lea_name
	,rls.sch_name
	,rls.jj
	,advmath.tot_mathenr_advm_m AS advmath_m_enr
	,advmath.tot_mathenr_advm_f AS advmath_f_enr
	,advpl.TOT_APEXAM_NONE_M AS advpl_m_noexam
	,advpl.TOT_APEXAM_NONE_F AS advpl_f_noexam
	,alg1.TOT_ALGPASS_GS0910_M AS alg1_m_0910_passed
	,alg1.TOT_ALGPASS_GS1112_M AS alg1_m_1112_passed
	,alg1.TOT_ALGPASS_GS0910_F AS alg1_f_0910_passed
	,alg1.TOT_ALGPASS_GS1112_F AS alg1_f_1112_passed
	,alg2.tot_mathenr_alg2_m AS alg2_m_enr
	,alg2.tot_mathenr_alg2_f AS alg2_f_enr
	,bio.TOT_SCIENR_BIOL_M AS bio_m_enr
	,bio.TOT_SCIENR_BIOL_F AS bio_f_enr
	,calc.TOT_MATHENR_CALC_M AS calc_m_enr
	,calc.TOT_MATHENR_CALC_F AS calc_f_enr
	,chem.TOT_SCIENR_CHEM_M AS chem_m_enr
	,chem.TOT_SCIENR_CHEM_F AS chem_f_enr
	,dual.TOT_DUAL_M AS dual_m_enr
	,dual.TOT_DUAL_F AS dual_f_enr
FROM ref_schema.ref_lea_sch rls
JOIN data_schema.sch_advancedmathematics advmath ON advmath.combokey = rls.combokey
JOIN data_schema.sch_advancedplacement advpl ON advpl.combokey = rls.combokey
JOIN data_schema.sch_algebrai alg1 ON alg1.combokey = rls.combokey
JOIN data_schema.sch_algebraii alg2 ON alg2.combokey = rls.combokey 
JOIN data_schema.sch_biology bio ON bio.combokey = rls.combokey 
JOIN data_schema.sch_calculus calc ON calc.combokey = rls.combokey 
JOIN data_schema.sch_chemistry chem ON chem.combokey = rls.combokey 
JOIN data_schema.sch_dualenrollment dual ON dual.combokey = rls.combokey 
JOIN data_schema.sch_schoolcharacteristics chr ON chr.combokey = rls.combokey 
JOIN data_schema.saipe_ussd17 saipe ON saipe.leaid = rls.leaid
WHERE chr.hs_only = TRUE
order by leaid;
"""
# Batch 2: total/LEP/504/IDEA enrollment, geometry, physics, SAT/ACT counts,
# state, and SAIPE population/poverty figures for the same school set.
# NOTE(review): both queries order by leaid only; within a district the
# per-school row order is not guaranteed by SQL — relevant because the two
# result sets are later combined by position, not by key.
batch2_query="""
SELECT 
	rls.leaid
	,enr.tot_enr_m AS total_m_enr
	,enr.tot_enr_f AS total_f_enr
	,enr.SCH_ENR_LEP_M AS enr_lep_m
	,enr.SCH_ENR_LEP_F AS enr_lep_f
	,enr.SCH_ENR_504_M AS enr_504_m
	,enr.SCH_ENR_504_F AS enr_504_f
	,enr.SCH_ENR_IDEA_M AS enr_idea_m
	,enr.SCH_ENR_IDEA_F AS enr_idea_f
	,geo.TOT_MATHENR_GEOM_M AS geo_m_enr
	,geo.TOT_MATHENR_GEOM_F AS geo_f_enr
	,phys.TOT_SCIENR_PHYS_M AS phys_m_enr
	,phys.TOT_SCIENR_PHYS_F AS phys_f_enr
	,satact.TOT_SATACT_M AS satact_m
	,satact.TOT_SATACT_F AS satact_f
	,rls.lea_state
	,saipe.totalpopulation
	,saipe.population5_17
	,saipe.population5_17inpoverty
FROM ref_schema.ref_lea_sch rls
JOIN data_schema.sch_enrollment enr ON enr.combokey = rls.combokey 
JOIN data_schema.sch_geometry geo ON geo.combokey = rls.combokey 
JOIN data_schema.sch_physics phys ON phys.combokey = rls.combokey 
JOIN data_schema.sch_satandact satact ON satact.combokey = rls.combokey 
JOIN data_schema.sch_schoolcharacteristics chr ON chr.combokey = rls.combokey 
JOIN data_schema.saipe_ussd17 saipe ON saipe.leaid = rls.leaid
WHERE chr.hs_only = TRUE
order by leaid;
"""
In [2]:
import psycopg2
import os

# Connect to the source database.
# SECURITY FIX: the password was hardcoded in the notebook. Read it from the
# environment instead; the fallback keeps the notebook runnable in the
# original dev environment, but should be removed before sharing.
db_params = {
    "database": "postgres",
    "user": "postgres",
    "password": os.environ.get("POSTGRES_PASSWORD", "pwd123"),
    "host": "postgres-db",
    "port": "5432"
}
connection = psycopg2.connect(**db_params)
cursor = connection.cursor()

# Sanity check that the connection is alive before running the big queries.
query = "SELECT 1;"
cursor.execute(query)
result = cursor.fetchall()
print(f"Database check: {result}")
Database check: [(1,)]
In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from kneed import KneeLocator
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [4]:
# Load both batches into DataFrames.
# NOTE: pandas warns that raw DBAPI2 (psycopg2) connections are untested;
# wrapping the connection in an SQLAlchemy engine would silence the warning.
df = pd.read_sql(batch1_query, connection)   # batch 1: course enrollment counts
b2 = pd.read_sql(batch2_query, connection)   # batch 2: totals, SAT/ACT, SAIPE
/tmp/ipykernel_27527/351946651.py:1: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  df = pd.read_sql(batch1_query, connection)
/tmp/ipykernel_27527/351946651.py:2: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
  b2 = pd.read_sql(batch2_query, connection)
In [5]:
# Offline fallback: load previously exported CSV snapshots instead of
# querying the database (kept commented out intentionally).
# df = pd.read_csv('batch1.csv')
# b2 = pd.read_csv('batch2.csv')
In [6]:
# Append batch-2 columns onto the batch-1 frame by row position.
# NOTE(review): this assumes both queries returned the same schools in the
# same order. Both are ordered by leaid only, so within-district row order is
# not guaranteed — a key-based merge on leaid/schid (or combokey) would be
# safer. Verify row counts and alignment before trusting downstream results.
for col in b2.columns:
    if col not in df.columns:
        df[col] = b2[col]
In [7]:
df.columns  # inspect the combined column set after the positional merge
Out[7]:
Index(['leaid', 'schid', 'lea_name', 'sch_name', 'jj', 'advmath_m_enr',
       'advmath_f_enr', 'advpl_m_noexam', 'advpl_f_noexam',
       'alg1_m_0910_passed', 'alg1_m_1112_passed', 'alg1_f_0910_passed',
       'alg1_f_1112_passed', 'alg2_m_enr', 'alg2_f_enr', 'bio_m_enr',
       'bio_f_enr', 'calc_m_enr', 'calc_f_enr', 'chem_m_enr', 'chem_f_enr',
       'dual_m_enr', 'dual_f_enr', 'total_m_enr', 'total_f_enr', 'enr_lep_m',
       'enr_lep_f', 'enr_504_m', 'enr_504_f', 'enr_idea_m', 'enr_idea_f',
       'geo_m_enr', 'geo_f_enr', 'phys_m_enr', 'phys_f_enr', 'satact_m',
       'satact_f', 'lea_state', 'totalpopulation', 'population5_17',
       'population5_17inpoverty'],
      dtype='object')
In [8]:
# Floor every per-school count column at zero; identifier and district
# population columns are left untouched.
# (Negative values presumably encode missing/reserved codes in the source
# data — TODO confirm against the data dictionary.)
exclude_cols = ['leaid', 'schid', 'lea_name', 'sch_name', 'jj',
                'lea_state', 'totalpopulation', 'population5_17',
                'population5_17inpoverty']
columns_to_modify = df.columns.difference(exclude_cols)
df[columns_to_modify] = df[columns_to_modify].clip(lower=0)
In [9]:
df.head()  # preview the clipped frame
Out[9]:
leaid schid lea_name sch_name jj advmath_m_enr advmath_f_enr advpl_m_noexam advpl_f_noexam alg1_m_0910_passed ... geo_m_enr geo_f_enr phys_m_enr phys_f_enr satact_m satact_f lea_state totalpopulation population5_17 population5_17inpoverty
0 0100005 00871 Albertville City Albertville High School False 141 182 2 3 235 ... 169 160 169 113 177 186 AL 21786 4115 1546
1 0100006 00878 Marshall County Douglas High School False 38 43 0 0 63 ... 159 117 0 0 83 59 AL 48481 8762 2495
2 0100006 00883 Marshall County Kate D Smith DAR High School False 20 21 0 0 63 ... 60 51 0 0 56 47 AL 48481 8762 2495
3 0100007 00251 Hoover City Hoover High School False 449 513 0 13 19 ... 400 367 92 46 597 592 AL 82783 14679 1038
4 0100007 01456 Hoover City Spain Park High School False 302 308 46 61 0 ... 227 191 0 0 325 322 AL 82783 14679 1038

5 rows × 41 columns

In [10]:
# Convert raw counts to per-student rates by dividing each count column by
# the school's total (male + female) enrollment.
exclude_cols = ['leaid', 'schid', 'lea_name', 'sch_name', 'jj', 
                'lea_state', 'totalpopulation', 'population5_17',
                'population5_17inpoverty', 'total_enrollment']
enrollment_sum = df['total_m_enr'] + df['total_f_enr']
df['total_enrollment'] = enrollment_sum
columns_to_modify = df.columns.difference(exclude_cols)
# fillna(0) handles 0/0 -> NaN; note that x/0 -> inf would NOT be caught,
# but zero-enrollment schools are dropped a few cells below.
df[columns_to_modify] = df[columns_to_modify].div(enrollment_sum, axis=0).fillna(0)
In [11]:
# Inspect near-empty schools before removing them (jails, re-entry and
# alternative programs show up here). .loc avoids chained indexing.
low_enrollment_mask = enrollment_sum <= 1
df.loc[low_enrollment_mask, ['total_enrollment', 'leaid',
                             'sch_name', 'lea_state', 'totalpopulation']]
Out[11]:
total_enrollment leaid sch_name lea_state totalpopulation
5604 0 2509360 Peabody Veterans Memorial High MA 54172
6290 1 2703870 BECKER ALTERNATIVE LEARNING PROGRAM MN 12177
12291 1 4831040 HIDALGO CO J J A E P TX 68281
13157 1 5301410 Re-Entry High School WA 84862
13219 1 5303930 Benton County Jail WA 96801
13355 1 5307740 State Street High School WA 28167
In [12]:
# Remove low-enrollment schools and renumber rows from zero.
# NOTE(review): the table above listed schools with total <= 1, but this
# filter also drops schools with exactly 2 students — confirm the intended
# threshold.
df = df[enrollment_sum > 2].reset_index(drop=True)
In [13]:
df.head()  # preview the per-student-rate frame after filtering
Out[13]:
leaid schid lea_name sch_name jj advmath_m_enr advmath_f_enr advpl_m_noexam advpl_f_noexam alg1_m_0910_passed ... geo_f_enr phys_m_enr phys_f_enr satact_m satact_f lea_state totalpopulation population5_17 population5_17inpoverty total_enrollment
0 0100005 00871 Albertville City Albertville High School False 0.097308 0.125604 0.001380 0.002070 0.162181 ... 0.110421 0.116632 0.077985 0.122153 0.128364 AL 21786 4115 1546 1449
1 0100006 00878 Marshall County Douglas High School False 0.064298 0.072758 0.000000 0.000000 0.106599 ... 0.197970 0.000000 0.000000 0.140440 0.099831 AL 48481 8762 2495 591
2 0100006 00883 Marshall County Kate D Smith DAR High School False 0.044248 0.046460 0.000000 0.000000 0.139381 ... 0.112832 0.000000 0.000000 0.123894 0.103982 AL 48481 8762 2495 452
3 0100007 00251 Hoover City Hoover High School False 0.155579 0.177755 0.000000 0.004505 0.006584 ... 0.127166 0.031878 0.015939 0.206861 0.205128 AL 82783 14679 1038 2886
4 0100007 01456 Hoover City Spain Park High School False 0.180947 0.184542 0.027561 0.036549 0.000000 ... 0.114440 0.000000 0.000000 0.194727 0.192930 AL 82783 14679 1038 1669

5 rows × 42 columns

In [14]:
# District-level poverty rate among 5-17 year-olds (from SAIPE counts).
df['5_17_poverty_percent'] = df['population5_17inpoverty']/df['population5_17']
In [15]:
df.columns.difference(exclude_cols)  # feature columns used for PCA/clustering
Out[15]:
Index(['5_17_poverty_percent', 'advmath_f_enr', 'advmath_m_enr',
       'advpl_f_noexam', 'advpl_m_noexam', 'alg1_f_0910_passed',
       'alg1_f_1112_passed', 'alg1_m_0910_passed', 'alg1_m_1112_passed',
       'alg2_f_enr', 'alg2_m_enr', 'bio_f_enr', 'bio_m_enr', 'calc_f_enr',
       'calc_m_enr', 'chem_f_enr', 'chem_m_enr', 'dual_f_enr', 'dual_m_enr',
       'enr_504_f', 'enr_504_m', 'enr_idea_f', 'enr_idea_m', 'enr_lep_f',
       'enr_lep_m', 'geo_f_enr', 'geo_m_enr', 'phys_f_enr', 'phys_m_enr',
       'satact_f', 'satact_m', 'total_f_enr', 'total_m_enr'],
      dtype='object')
In [16]:
# Do PCA
In [17]:
# Hover-label metadata for the interactive 3D plots.
ids = df['leaid'].to_numpy()
sch_names = df['sch_name'].to_numpy()
lea_names = df['lea_name'].to_numpy()
states = df['lea_state'].to_numpy()
In [18]:
ids = df['leaid'].values

# Step 1: Subset the feature columns (identifiers/population columns dropped).
feature_cols = df.columns.difference(exclude_cols)
subset_df = df[feature_cols]
# PCA is fit only on schools with more than 15 students so that tiny schools
# do not dominate the components.
pca_mask = df['total_enrollment'] > 15
for_pca_use = df.loc[pca_mask, feature_cols]

# Step 2: Standardize the data (zero mean, unit variance per feature).
scaler = StandardScaler()
standardized_data = scaler.fit_transform(subset_df)  # reused later for K-Means/LDA
pca_data = scaler.fit_transform(for_pca_use)

# Step 3: PCA via eigendecomposition of the covariance matrix.
# FIX: use eigh (symmetric eigensolver) — np.cov is symmetric, and eig can
# return spurious tiny imaginary parts; eigh guarantees real results.
cov_matrix = np.cov(pca_data, rowvar=False)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sort eigenvectors by eigenvalue size (descending order)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]
eigenvalues = eigenvalues[sorted_indices]

# Step 4: Project data onto the top 3 principal components
projected_data = np.dot(pca_data, eigenvectors[:, :3])

# Step 5: Interactive 3D plot.
# BUG FIX: hover labels previously zipped the *unfiltered* frame's
# ids/names against the *filtered* projection, misaligning labels after the
# first school with <=15 students. Take metadata from the same filtered rows.
meta = df.loc[pca_mask, ['leaid', 'sch_name', 'lea_name', 'lea_state']]
trace = go.Scatter3d(
    x=projected_data[:, 0],
    y=projected_data[:, 1],
    z=projected_data[:, 2],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.8),
    text=[f"LEA ID: {i}, {state}<br>School Name: {sch}<br>LEA Name: {lea}"
          for i, sch, lea, state in zip(meta['leaid'], meta['sch_name'],
                                        meta['lea_name'], meta['lea_state'])],
    # Display ID, School Name, and LEA Name when hovering
    hoverinfo="text+x+y+z"
)

# Pad each axis range by 10% so points do not sit on the plot boundary.
# BUG FIX: the original computed padded ranges via exec() but never applied
# them (only the raw x-range was set); apply the padded range to all axes.
axis_ranges = []
for axis in range(3):
    lo, hi = projected_data[:, axis].min(), projected_data[:, axis].max()
    pad = 0.1 * (hi - lo)
    axis_ranges.append([lo - pad, hi + pad])

layout = go.Layout(
    title="Data Projected on Top 3 Principal Components",
    scene=dict(
        xaxis=dict(title="Principal Component 1", range=axis_ranges[0]),
        yaxis=dict(title="Principal Component 2", range=axis_ranges[1]),
        zaxis=dict(title="Principal Component 3", range=axis_ranges[2])
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [19]:
# The three schools with the most extreme |PC1| scores.
# NOTE(review): projected_data comes from the >15-enrollment subset while df
# is the full frame — positional iloc indices may not map to the intended
# rows; verify the alignment.
extreme_PC1 = df.iloc[np.argsort(np.abs(projected_data[:, 0]))[-3:]]
extreme_PC1.T
Out[19]:
7733 9938 1513
leaid 3417430 4013560 0626880
schid 02662 00633 11700
lea_name WEST DEPTFORD TOWNSHIP SCHOOL DISTRICT GUTHRIE Nevada Joint Union High
sch_name West Deptford High School GUTHRIE HS William & Marian Ghidotti High
jj False False False
advmath_m_enr 0.095672 0.049495 0.0
advmath_f_enr 0.075171 0.061616 0.0
advpl_m_noexam 0.034169 0.022222 0.0
advpl_f_noexam 0.041002 0.046465 0.0
alg1_m_0910_passed 0.087699 0.09697 0.018519
alg1_m_1112_passed 0.005695 0.0 0.0
alg1_f_0910_passed 0.092255 0.09697 0.098765
alg1_f_1112_passed 0.005695 0.0 0.0
alg2_m_enr 0.129841 0.126263 0.0
alg2_f_enr 0.113895 0.09899 0.0
bio_m_enr 0.126424 0.147475 0.135802
bio_f_enr 0.126424 0.158586 0.197531
calc_m_enr 0.018223 0.005051 0.0
calc_f_enr 0.004556 0.019192 0.0
chem_m_enr 0.107062 0.069697 0.111111
chem_f_enr 0.094533 0.073737 0.098765
dual_m_enr 0.019362 0.007071 0.0
dual_f_enr 0.030752 0.030303 0.0
total_m_enr 0.547836 0.505051 0.388889
total_f_enr 0.452164 0.494949 0.611111
enr_lep_m 0.001139 0.014141 0.0
enr_lep_f 0.0 0.016162 0.006173
enr_504_m 0.022779 0.012121 0.018519
enr_504_f 0.019362 0.017172 0.04321
enr_idea_m 0.142369 0.09596 0.0
enr_idea_f 0.089977 0.053535 0.0
geo_m_enr 0.11959 0.113131 0.111111
geo_f_enr 0.105923 0.129293 0.092593
phys_m_enr 0.019362 0.016162 0.0
phys_f_enr 0.001139 0.015152 0.0
satact_m 0.118451 0.130303 0.0
satact_f 0.111617 0.10404 0.0
lea_state NJ OK CA
totalpopulation 21926 25434 83537
population5_17 3272 4203 3857
population5_17inpoverty 233 719 437
total_enrollment 878 990 162
5_17_poverty_percent 0.07121 0.171068 0.1133
In [20]:
# Loading vectors (feature weights) of the first two principal components.
pc1 = eigenvectors[:, 0]
pc2 = eigenvectors[:, 1]
In [21]:
pc1  # raw PC1 loading vector (one weight per feature column)
Out[21]:
array([-0.00100284, -0.23064523, -0.23032   , -0.22651181, -0.22593376,
       -0.25565055, -0.14396451, -0.25893149, -0.14164528, -0.27570838,
       -0.27289358, -0.26962925, -0.27600864, -0.23482294, -0.23996264,
       -0.27465776, -0.27064562, -0.1689799 , -0.15492283, -0.00460761,
       -0.00484771, -0.0236042 , -0.0348998 , -0.01411565, -0.02032434,
        0.00181729, -0.01727402,  0.01024074,  0.01278079,  0.0192746 ,
        0.01617161,  0.02491917, -0.02491917])
In [22]:
# Print each feature's PC1 loading as a percentage.
# FIX: the original had a stray no-op `df.columns.difference(...)` expression
# and recomputed the difference on every loop iteration; compute it once.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for col_name, weight in zip(feature_names, pc1):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -0.10%
advmath_f_enr       : -23.06%
advmath_m_enr       : -23.03%
advpl_f_noexam      : -22.65%
advpl_m_noexam      : -22.59%
alg1_f_0910_passed  : -25.57%
alg1_f_1112_passed  : -14.40%
alg1_m_0910_passed  : -25.89%
alg1_m_1112_passed  : -14.16%
alg2_f_enr          : -27.57%
alg2_m_enr          : -27.29%
bio_f_enr           : -26.96%
bio_m_enr           : -27.60%
calc_f_enr          : -23.48%
calc_m_enr          : -24.00%
chem_f_enr          : -27.47%
chem_m_enr          : -27.06%
dual_f_enr          : -16.90%
dual_m_enr          : -15.49%
enr_504_f           : -0.46%
enr_504_m           : -0.48%
enr_idea_f          : -2.36%
enr_idea_m          : -3.49%
enr_lep_f           : -1.41%
enr_lep_m           : -2.03%
geo_f_enr           : 0.18%
geo_m_enr           : -1.73%
phys_f_enr          : 1.02%
phys_m_enr          : 1.28%
satact_f            : 1.93%
satact_m            : 1.62%
total_f_enr         : 2.49%
total_m_enr         : -2.49%
In [23]:
# Print each feature's PC2 loading as a percentage.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC2 Weight")
for col_name, weight in zip(feature_names, pc2):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC2 Weight
5_17_poverty_percent: -14.94%
advmath_f_enr       : 5.64%
advmath_m_enr       : 5.09%
advpl_f_noexam      : 3.26%
advpl_m_noexam      : 2.92%
alg1_f_0910_passed  : -1.37%
alg1_f_1112_passed  : -8.56%
alg1_m_0910_passed  : -2.45%
alg1_m_1112_passed  : -11.94%
alg2_f_enr          : 2.19%
alg2_m_enr          : 0.80%
bio_f_enr           : 1.08%
bio_m_enr           : -1.19%
calc_f_enr          : 3.48%
calc_m_enr          : 3.95%
chem_f_enr          : 2.08%
chem_m_enr          : 1.01%
dual_f_enr          : 7.93%
dual_m_enr          : 6.76%
enr_504_f           : 21.55%
enr_504_m           : 16.59%
enr_idea_f          : -21.67%
enr_idea_m          : -35.72%
enr_lep_f           : -14.54%
enr_lep_m           : -21.57%
geo_f_enr           : 21.09%
geo_m_enr           : -0.61%
phys_f_enr          : 26.02%
phys_m_enr          : 20.77%
satact_f            : 32.56%
satact_m            : 19.55%
total_f_enr         : 39.80%
total_m_enr         : -39.80%

Comments¶

PC1 loads heavily (all with the same sign) on the STEM course-enrollment rates, while PC2 contrasts physics, SAT/ACT, 504, and total female enrollment (positive weights, per the printout above) against IDEA, LEP, and the poverty rate (negative weights). Note that the overall sign of a principal component is arbitrary — only the relative signs of the weights are meaningful.

In [24]:
# Elbow method: record K-Means inertia for k = 1..10 on the standardized data.
inertia = []
k_range = range(1, 11)
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(standardized_data)
    inertia.append(km.inertia_)

# Plot the elbow curve using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_range, inertia, 'bo-')
ax.set(xlabel='Number of clusters (k)', ylabel='Inertia',
       title='Elbow Method for Optimal k')
plt.show()
No description has been provided for this image
In [25]:
# Locate the elbow of the inertia curve programmatically.
knee = KneeLocator(k_range, inertia, curve="convex", direction="decreasing")
optimal_k = knee.elbow  # elbow point
print(f"The optimal number of clusters (k) is: {optimal_k}")
The optimal number of clusters (k) is: 3
In [26]:
# Final K-Means fit at the elbow k; per-school cluster label stored in df.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(standardized_data)
In [27]:
enr_cols = []  # NOTE(review): appears unused in this notebook — confirm before removing
# Report how many schools landed in each cluster.
unique_clusters = np.unique(df['cluster'])
print(f"{'Cluster'.ljust(10)}: Schools in Dataset")
for cluster in unique_clusters:
    cluster_size = (df['cluster'] == cluster).sum()
    print(f"{str(cluster).ljust(10)}: {cluster_size}")
Cluster   : Schools in Dataset
0         : 59
1         : 14089
2         : 2
In [28]:
def lda(X, y):
    """Linear discriminant analysis via the generalized eigenproblem.

    Builds the within-class scatter matrix Sw and between-class scatter
    matrix Sb, then solves Sw^-1 Sb v = lambda v.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (should be standardized by the caller).
    y : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    (eigenvalues, eigenvectors)
        Both sorted by descending eigenvalue; eigenvectors are columns.
        FIX: eigenvalues are returned as real numbers — they are real in
        exact arithmetic for this problem, and the original sorted/returned
        raw complex values from np.linalg.eig.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    overall_mean = X.mean(axis=0)
    class_means, centered, counts = [], [], []
    for cl in np.unique(y):
        data = X[y == cl]
        class_means.append(data.mean(axis=0))
        centered.append(data - class_means[-1])
        counts.append(len(data))
    # Within-class scatter: sum of per-class scatter matrices.
    Sw = sum(c.T @ c for c in centered)
    # Between-class scatter: class-size-weighted outer products of
    # (class mean - overall mean).
    Sb = sum(n_i * np.outer(d, d)
             for d, n_i in zip(np.asarray(class_means) - overall_mean, counts))
    # FIX: solve(Sw, Sb) is numerically preferable to forming inv(Sw) @ Sb.
    eigval, eigvec = np.linalg.eig(np.linalg.solve(Sw, Sb))
    # Discard numerical imaginary noise before sorting.
    eigval = np.real(eigval)
    eigvec = np.real(eigvec)
    idx = np.argsort(eigval)[::-1]
    return eigval[idx], eigvec[:, idx]
In [29]:
# Project the standardized features onto the LDA discriminants and plot the
# first three components, colored by K-Means cluster.
X = standardized_data
y = df['cluster']
eigval, eigvec = lda(X, y)
X_lda = X @ eigvec

# Pad with zero columns if fewer than 3 discriminant axes are available,
# so the 3D scatter always has three coordinates.
n_missing = 3 - X_lda.shape[1]
if n_missing > 0:
    X_lda = np.pad(X_lda, ((0, 0), (0, n_missing)), mode='constant')

# Display ID, School Name, and LEA Name when hovering.
hover_text = [
    f"LEA ID: {i}, {state}<br>School Name: {sch}<br>LEA Name: {lea}"
    for i, sch, lea, state in zip(ids, sch_names, lea_names, states)
]
trace = go.Scatter3d(
    x=X_lda[:, 0],
    y=X_lda[:, 1],
    z=X_lda[:, 2],
    mode='markers',
    marker=dict(size=5, color=y, opacity=0.8),
    text=hover_text,
    hoverinfo="text+x+y+z"
)

layout = go.Layout(
    title="LDA Projection on Top 3 Discriminant Components",
    scene=dict(
        xaxis_title="LDA Component 1",
        yaxis_title="LDA Component 2",
        zaxis_title="LDA Component 3"
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)
In [30]:
# Three schools most extreme along the first discriminant axis (these turn
# out to be the tiny-enrollment outlier clusters).
extreme_LDA = df.iloc[np.argsort(np.abs(X_lda[:, 0]))[-3:]]
extreme_LDA.T
Out[30]:
13381 12910 12909
leaid 5308700 5101890 5101890
schid 01488 00811 00809
lea_name Tacoma School District HENRICO CO PBLC SCHS HENRICO CO PBLC SCHS
sch_name Mt Tahoma John Randolph Tucker High Highland Springs High
jj False False False
advmath_m_enr 23.0 33.2 30.0
advmath_f_enr 22.0 37.8 26.25
advpl_m_noexam 14.363636 23.0 27.75
advpl_f_noexam 13.909091 37.6 33.5
alg1_m_0910_passed 8.818182 23.4 40.75
alg1_m_1112_passed 0.454545 0.4 3.25
alg1_f_0910_passed 6.181818 17.2 39.0
alg1_f_1112_passed 0.090909 0.6 1.5
alg2_m_enr 21.454545 28.6 37.0
alg2_f_enr 18.181818 32.4 34.75
bio_m_enr 28.909091 67.2 116.75
bio_f_enr 22.090909 71.8 106.0
calc_m_enr 2.272727 5.0 9.0
calc_f_enr 2.090909 4.8 5.75
chem_m_enr 13.909091 26.4 18.0
chem_f_enr 14.727273 35.8 35.0
dual_m_enr 0.0 12.2 18.25
dual_f_enr 0.0 15.8 19.5
total_m_enr 0.636364 0.6 0.5
total_f_enr 0.363636 0.4 0.5
enr_lep_m 0.0 0.0 0.0
enr_lep_f 0.090909 0.2 0.0
enr_504_m 0.0 0.0 0.0
enr_504_f 0.0 0.0 0.0
enr_idea_m 0.454545 0.4 0.25
enr_idea_f 0.090909 0.0 0.0
geo_m_enr 0.0 0.0 0.0
geo_f_enr 0.0 0.0 0.0
phys_m_enr 0.0 0.0 0.0
phys_f_enr 0.0 0.0 0.0
satact_m 0.090909 0.0 0.0
satact_f 0.0 0.0 0.0
lea_state WA VA VA
totalpopulation 230366 327898 327898
population5_17 34753 55138 55138
population5_17inpoverty 5374 7367 7367
total_enrollment 11 5 4
5_17_poverty_percent 0.154634 0.13361 0.13361
cluster 0 2 2
In [31]:
# First two discriminant directions (eigvec stores components as columns).
eig1, eig2 =(eigvec.T)[:2] # column = eigvec
# Exclude the new cluster label from subsequent feature-column selections.
exclude_cols.append('cluster')
In [32]:
# Per-feature weights of the first LDA discriminant.
# BUG FIX: the header previously said "PC1 Weight", but these are LDA
# weights, not principal-component loadings. Also hoist the repeated
# column-difference computation out of the loop.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LDA1 Weight")
for col_name, weight in zip(feature_names, eig1):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: -0.34%
advmath_f_enr       : 10.89%
advmath_m_enr       : -15.58%
advpl_f_noexam      : -68.22%
advpl_m_noexam      : 51.22%
alg1_f_0910_passed  : 4.97%
alg1_f_1112_passed  : -2.48%
alg1_m_0910_passed  : 0.27%
alg1_m_1112_passed  : 2.22%
alg2_f_enr          : 13.82%
alg2_m_enr          : -3.24%
bio_f_enr           : 8.40%
bio_m_enr           : -32.06%
calc_f_enr          : 7.36%
calc_m_enr          : -9.44%
chem_f_enr          : -18.30%
chem_m_enr          : 22.13%
dual_f_enr          : 4.23%
dual_m_enr          : -6.09%
enr_504_f           : 0.13%
enr_504_m           : -0.09%
enr_idea_f          : 0.02%
enr_idea_m          : -0.46%
enr_lep_f           : 0.53%
enr_lep_m           : -0.61%
geo_f_enr           : 0.02%
geo_m_enr           : -0.11%
phys_f_enr          : 0.29%
phys_m_enr          : -0.49%
satact_f            : 0.07%
satact_m            : -0.26%
total_f_enr         : -0.99%
total_m_enr         : -0.47%
In [33]:
# Per-feature weights of the second LDA discriminant.
# BUG FIX: the header previously said "PC1 Weight" for the *second* LDA
# component; label it correctly.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LDA2 Weight")
for col_name, weight in zip(feature_names, eig2):
    print(f"{col_name.ljust(20)}: {100*weight:.2f}%")
Column Name         : PC1 Weight
5_17_poverty_percent: 0.04%
advmath_f_enr       : -9.28%
advmath_m_enr       : 8.07%
advpl_f_noexam      : -13.97%
advpl_m_noexam      : 14.06%
alg1_f_0910_passed  : 30.17%
alg1_f_1112_passed  : -1.65%
alg1_m_0910_passed  : -28.92%
alg1_m_1112_passed  : 5.25%
alg2_f_enr          : 9.31%
alg2_m_enr          : 9.25%
bio_f_enr           : 47.67%
bio_m_enr           : -60.94%
calc_f_enr          : 5.21%
calc_m_enr          : 0.08%
chem_f_enr          : -27.37%
chem_m_enr          : 25.35%
dual_f_enr          : 5.26%
dual_m_enr          : -1.94%
enr_504_f           : 0.15%
enr_504_m           : -0.26%
enr_idea_f          : -0.38%
enr_idea_m          : 0.18%
enr_lep_f           : 0.10%
enr_lep_m           : -0.38%
geo_f_enr           : -0.68%
geo_m_enr           : 0.22%
phys_f_enr          : 0.07%
phys_m_enr          : -0.25%
satact_f            : -1.37%
satact_m            : 0.79%
total_f_enr         : -5.81%
total_m_enr         : -5.76%

Covariance¶

In [34]:
# The covariance of z-scored data equals (up to the ddof normalization —
# StandardScaler uses ddof=0, pandas .cov() uses ddof=1) the correlation
# matrix of the original features, hence the variable name.
standardized_df = pd.DataFrame(standardized_data)
standardized_df.columns = df.columns.difference(exclude_cols)
correlation_matrix = standardized_df.cov()
In [35]:
# Heatmap of the (approximate) correlation structure of the features.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap="bwr",
            cbar=True, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()
No description has been provided for this image
In [36]:
# Covariance of the raw (unstandardized) feature columns.
covariance_matrix = df[df.columns.difference(exclude_cols)].cov()
In [37]:
# Heatmap of the raw-feature covariance matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(covariance_matrix, annot=False, fmt=".2f", cmap="bwr",
            cbar=True, ax=ax)
ax.set_title('Covariance Matrix Heatmap')
plt.show()
No description has been provided for this image

Multiple Regression¶

In [38]:
# Regression target: district 5-17 poverty rate; candidate predictors: all
# remaining per-student enrollment-rate columns.
dependent_var = '5_17_poverty_percent'
independent_vars = df.columns.difference(exclude_cols + [dependent_var])
In [39]:
# Manual backward elimination: predictors dropped for high p-values or high
# variance inflation factors observed in earlier (not shown) fits.
high_p_vals = ['advmath_f_enr','alg1_f_1112_passed','total_m_enr',
               'advpl_m_noexam','chem_f_enr','alg2_m_enr','enr_lep_f',
               'chem_m_enr','alg1_f_0910_passed','alg1_m_0910_passed',
               'bio_m_enr','bio_f_enr', 'advpl_f_noexam']
high_vif = ['calc_m_enr','dual_m_enr']
independent_vars = independent_vars.difference(high_p_vals).difference(high_vif)
independent_vars  # surviving predictor set
Out[39]:
Index(['advmath_m_enr', 'alg1_m_1112_passed', 'alg2_f_enr', 'calc_f_enr',
       'dual_f_enr', 'enr_504_f', 'enr_504_m', 'enr_idea_f', 'enr_idea_m',
       'enr_lep_m', 'geo_f_enr', 'geo_m_enr', 'phys_f_enr', 'phys_m_enr',
       'satact_f', 'satact_m', 'total_f_enr'],
      dtype='object')
In [40]:
# Fit OLS: poverty rate ~ selected enrollment rates (with intercept).
Y = df[dependent_var]
X = sm.add_constant(df[independent_vars])
model = sm.OLS(Y, X).fit()
model.summary()
Out[40]:
OLS Regression Results
Dep. Variable: 5_17_poverty_percent R-squared: 0.119
Model: OLS Adj. R-squared: 0.118
Method: Least Squares F-statistic: 112.5
Date: Fri, 30 Aug 2024 Prob (F-statistic): 0.00
Time: 03:33:01 Log-Likelihood: 13476.
No. Observations: 14150 AIC: -2.692e+04
Df Residuals: 14132 BIC: -2.678e+04
Df Model: 17
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 0.1185 0.010 12.285 0.000 0.100 0.137
advmath_m_enr -0.0258 0.003 -8.309 0.000 -0.032 -0.020
alg1_m_1112_passed -0.0926 0.019 -4.796 0.000 -0.130 -0.055
alg2_f_enr 0.0357 0.004 9.521 0.000 0.028 0.043
calc_f_enr -0.0699 0.008 -8.419 0.000 -0.086 -0.054
dual_f_enr 0.0023 0.004 0.644 0.519 -0.005 0.009
enr_504_f -0.8270 0.062 -13.303 0.000 -0.949 -0.705
enr_504_m -0.2709 0.051 -5.286 0.000 -0.371 -0.170
enr_idea_f -0.0786 0.033 -2.400 0.016 -0.143 -0.014
enr_idea_m 0.1681 0.021 8.147 0.000 0.128 0.209
enr_lep_m 0.2229 0.015 14.957 0.000 0.194 0.252
geo_f_enr -0.0520 0.026 -1.976 0.048 -0.104 -0.000
geo_m_enr 0.1216 0.023 5.361 0.000 0.077 0.166
phys_f_enr 0.2804 0.031 9.187 0.000 0.221 0.340
phys_m_enr -0.3338 0.027 -12.155 0.000 -0.388 -0.280
satact_f -0.1237 0.022 -5.666 0.000 -0.167 -0.081
satact_m -0.0735 0.021 -3.462 0.001 -0.115 -0.032
total_f_enr 0.1653 0.019 8.739 0.000 0.128 0.202
Omnibus: 2255.955 Durbin-Watson: 0.776
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3998.293
Skew: 1.030 Prob(JB): 0.00
Kurtosis: 4.593 Cond. No. 112.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [41]:
# Variance inflation factors for every column of the design matrix
# (including the constant) to check for residual multicollinearity.
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [vif(X.values, i) for i in range(X.shape[1])],
})
vif_data
Out[41]:
Variable VIF
0 const 150.839370
1 advmath_m_enr 5.668495
2 alg1_m_1112_passed 1.461331
3 alg2_f_enr 9.425364
4 calc_f_enr 2.625509
5 dual_f_enr 2.022807
6 enr_504_f 2.033253
7 enr_504_m 2.070474
8 enr_idea_f 2.319287
9 enr_idea_m 2.817331
10 enr_lep_m 1.067537
11 geo_f_enr 2.664006
12 geo_m_enr 2.584946
13 phys_f_enr 4.389944
14 phys_m_enr 4.375656
15 satact_f 4.005218
16 satact_m 3.688950
17 total_f_enr 2.816396
In [42]:
# Release database resources: cursor first, then the connection.
for resource in (cursor, connection):
    if resource:
        resource.close()